Direct S3 Data Access: NetCDF - Daymet v4 Daily TMAX Example


Import Required Packages

%matplotlib inline
import matplotlib.pyplot as plt
from datetime import datetime
import os
import subprocess
import requests
from pystac_client import Client
import boto3
import s3fs
import pandas as pd
import numpy as np
import xarray as xr
import rasterio as rio
from rasterio.session import AWSSession
from rasterio.plot import show
import rioxarray
import geopandas
import pyproj
from pyproj import Proj
from shapely.ops import transform
import geoviews as gv
from cartopy import crs
import hvplot.xarray
import holoviews as hv
gv.extension('bokeh', 'matplotlib')
# Map each NASA DAAC to its temporary-S3-credentials endpoint.
s3_cred_endpoint = {
    'podaac':'https://archive.podaac.earthdata.nasa.gov/s3credentials',
    'lpdaac':'https://data.lpdaac.earthdatacloud.nasa.gov/s3credentials',
    'ornldaac':'https://data.ornldaac.earthdata.nasa.gov/s3credentials'
}

def get_temp_creds(daac='ornldaac', timeout=30):
    """Request temporary AWS S3 credentials from a NASA DAAC endpoint.

    Parameters
    ----------
    daac : str, optional
        Key into ``s3_cred_endpoint`` selecting the provider
        (default ``'ornldaac'``, preserving the original behavior).
    timeout : int or float, optional
        Seconds to wait for the HTTP response. The original call had no
        timeout and could hang indefinitely on a stalled connection.

    Returns
    -------
    dict
        Credential payload; downstream code reads 'accessKeyId',
        'secretAccessKey', and 'sessionToken' from it.

    Raises
    ------
    KeyError
        If ``daac`` is not a known endpoint key.
    requests.HTTPError
        If the credential request returns a non-2xx status.
    """
    temp_creds_url = s3_cred_endpoint[daac]
    resp = requests.get(temp_creds_url, timeout=timeout)
    # Fail loudly on auth/server errors instead of silently decoding an
    # error body as if it were credentials.
    resp.raise_for_status()
    return resp.json()

temp_creds_req = get_temp_creds()

Connect to NASA’s ORNL Cloud Provider

ornldaac_cat = Client.open('https://cmr.earthdata.nasa.gov/stac/ORNL_CLOUD/')

Search by collection and by datetime. Daymet Daily tmax data are distributed as netCDF files, with each file containing one year's worth of daily data (i.e., 365 time steps)

# Search the catalog for the Daymet Daily V4 collection, restricted to
# the 2015-2020 temporal range ('start/end' interval syntax).
search = ornldaac_cat.search(
    collections=['Daymet_Daily_V4_1840.v4.0'],
    datetime='2015/2020'
)

Each STAC Item returned contains 21 netCDF files. The files represent weather variables for North America, Puerto Rico, and Hawaii

search.matched()
126
# Fetch every matching STAC Item (pages through all results).
items = search.get_all_items()
# Uncomment to inspect the items:
#list(items)

Filter the items returned for North America and by tmax

# Keep only the North America ('_na_') daily maximum temperature
# ('_tmax_') granules, taking the first asset's HTTPS href from each.
https_urls = []
for item in items:
    if '_na_' in item.id and '_tmax_' in item.id:
        first_asset = next(iter(item.assets.values()))
        https_urls.append(first_asset.href)
https_urls

Substitute a portion of the HTTPS url to create the S3 location

# Swap the public HTTPS prefix for the protected-bucket S3 prefix to
# build direct-access object URLs.
_https_prefix = 'https://data.ornldaac.earthdata.nasa.gov/protected/'
_s3_prefix = 's3://ornl-cumulus-prod-protected/'
s3_urls = [url.replace(_https_prefix, _s3_prefix) for url in https_urls]
s3_urls

Single file in-region direct S3 access of netcdf file

# Build an S3 filesystem handle authenticated with the temporary
# Earthdata credentials. NOTE(review): these credentials expire —
# rerun get_temp_creds() and recreate fs_s3 if access starts failing.
fs_s3 = s3fs.S3FileSystem(anon=False, key=temp_creds_req['accessKeyId'], secret=temp_creds_req['secretAccessKey'], token=temp_creds_req['sessionToken'])
# Use the first tmax granule for the single-file example.
s3_url = s3_urls[0]
s3_url
# Open the S3 object as a read-only binary file-like handle.
s3_file_obj = fs_s3.open(s3_url, mode='rb')
# Lazily open the netCDF-4/HDF5 file; 'h5netcdf' can read from a file
# object, and chunks='auto' defers actual data loading to dask.
xr_ds = xr.open_dataset(s3_file_obj, chunks='auto', engine='h5netcdf')
xr_ds

Multi-file in-region direct S3 access of netcdf files

# Open a file-like handle for each yearly tmax granule in s3_urls.
fileset = [fs_s3.open(file) for file in s3_urls]
# Combine all years into one dataset. If rerunning this line raises a
# context-manager error, rerun the fileset line above first — the open
# handles are not reusable across calls.
xr_ts = xr.open_mfdataset(fileset, chunks='auto', engine='h5netcdf')
xr_ts
# NOTE(review): 'SSH' is not a Daymet variable — presumably leftover
# from another example; use 'tmax' with this dataset.
#xr_ts.SSH.hvplot.image()